# -*- coding: utf-8 -*-
import MySQLdb
from datetime import datetime
import time
import scrapy
from scrapy import Request

from urllib import parse

import re

from horsveision.items import CsrcItem


class FinanceSpider(scrapy.Spider):
    """Crawl the finance.ce.cn front page and emit articles dated today.

    ``parse`` extracts article links from the front-page headline blocks and
    schedules ``parse_detail`` for each of them. ``parse_detail`` yields a
    ``CsrcItem`` for an article only when its release date is today and no
    row with the same title exists in the ``spider_content`` table.
    """

    name = 'finance'
    allowed_domains = ['finance.ce.cn']
    start_urls = ['http://finance.ce.cn/']
    # NOTE(review): this connection is opened as soon as the class body is
    # executed, and the credentials are hard-coded in source. Consider moving
    # them to the project settings / environment.
    db = MySQLdb.connect('localhost', 'root', '698a80e4212ba38f', 'toujiao', charset='utf8', use_unicode=True)

    def parse(self, response):
        """Yield a detail-page ``Request`` for every article link found.

        Args:
            response: The downloaded front page.

        Yields:
            ``Request`` objects whose callback is ``parse_detail``.
        """
        headline_blocks = response.css(".w_650 .pictxt1 .txt1 h4").extract()
        # The first matched block is deliberately skipped. Slicing (instead
        # of ``pop(0)``) keeps this safe when the selector matches nothing.
        for block_html in headline_blocks[1:]:
            # Prefer an absolute http link; otherwise fall back to a relative
            # path ending in "shtml".
            links = (
                re.findall(r'(?:http)(?:./)+[^=]+shtml', block_html)
                or re.findall(r'(?:./)+[^=]+shtml', block_html)
            )
            if not links:
                # No article link in this block: skip it rather than abort
                # the whole page with an IndexError.
                continue
            yield Request(
                url=parse.urljoin(response.url, links[0]),
                callback=self.parse_detail,
                dont_filter=True,
            )

    def parse_detail(self, response):
        """Yield a ``CsrcItem`` for an article released today and not yet stored.

        Args:
            response: The downloaded article page.

        Yields:
            At most one ``CsrcItem`` with ``title``, ``content``, ``source``,
            ``send_time`` (midnight of the release day as a Unix timestamp)
            and an empty ``cover_img``.
        """
        time_nodes = response.css("#articleTime::text").extract()
        title_nodes = response.css("#articleTitle::text").extract()
        if not time_nodes or not title_nodes:
            self.logger.warning("Missing title or release time on %s", response.url)
            return

        title = title_nodes[0].strip()
        # The time text is split on a space and only the first part (the
        # date) is parsed.
        release_day_text = time_nodes[0].strip().split(' ')[0]
        try:
            release_day = time.strptime(release_day_text, "%Y年%m月%d日")
        except ValueError:
            self.logger.warning(
                "Unparseable release date %r on %s", release_day_text, response.url
            )
            return

        # Only articles released today are of interest. Comparing calendar
        # fields is equivalent to comparing the two midnight timestamps.
        today = datetime.now()
        if (release_day.tm_year, release_day.tm_mon, release_day.tm_mday) != (
            today.year,
            today.month,
            today.day,
        ):
            return

        # Duplicate check by title. The title comes from a scraped page, so it
        # is passed as a bound parameter instead of being formatted into the
        # SQL text. The cursor is closed before anything is yielded.
        with self.db.cursor() as cursor:
            cursor.execute("SELECT title FROM spider_content WHERE title=%s", (title,))
            already_stored = cursor.fetchone() is not None
        if already_stored:
            return

        paragraphs = response.css("#articleText .TRS_Editor p").extract()

        finance_item = CsrcItem()
        finance_item['title'] = title
        finance_item['content'] = ''.join(paragraphs)
        finance_item['source'] = "中国经济网"
        finance_item['send_time'] = int(time.mktime(release_day))
        finance_item['cover_img'] = ''
        yield finance_item
